Sense change through collocations¶

Module imports¶

Let's first import all the required packages.

In [1]:
# general
import os
import numpy as np
import pandas as pd
import time
import re
import itertools
import pickle
In [2]:
# clustering algorithms, distance metrics
from sklearn.cluster import DBSCAN,KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
import scipy.cluster.hierarchy as sch
# networks
import networkx as nx
from pyvis import network
from pyvis.network import Network
In [3]:
# plotting
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
In [28]:
# corpus readers, co-occurrences and word vectors
from nltk.collocations import *
import nltk
import pickle
#from cltk.stops import lat as lat_stops
#import cltk
from nltk.util import skipgrams
from nltk.lm import NgramCounter
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.similarities import MatrixSimilarity

from utils.data.buildCollocs import BuildCollocs
from utils.data.readCorpus import NltkCorpusFromDir, CorpusFromDir, NltkCorpusFromList
from utils.data.buildModels import BuildModels
from utils.data.collDiffs import collDiffs
from nltk.corpus.reader.plaintext import PlaintextCorpusReader, CategorizedPlaintextCorpusReader
from nltk.tokenize.simple import SpaceTokenizer, LineTokenizer
from nltk.text import Text, TextCollection

If the rebuild parameter is set to True we will rerun the most resource-consuming code instead of reading the pre-computed variables from pickles.

In [5]:
rebuild = False  # True: rerun the expensive collocation computations; False: load the cached pickles

Terms¶

We are interested in semantic change patterns of a set of terms related to the socio-political life, such as:

In [6]:
# Terms under study: socio-political vocabulary whose sense change we track.
socio_political_terms = [
    "civitas", "consilium", "consul", "dux", "gens", "hostis",
    "imperator", "jus", "labor", "natio", "nobilitas", "pontifex",
    "pontificium", "populus", "potestas", "regnum", "senatus",
    "sodes", "urbs",
]
print(socio_political_terms)
['civitas', 'consilium', 'consul', 'dux', 'gens', 'hostis', 'imperator', 'jus', 'labor', 'natio', 'nobilitas', 'pontifex', 'pontificium', 'populus', 'potestas', 'regnum', 'senatus', 'sodes', 'urbs']

We're assigning each term a separate colour to facilitate our analyses.

In [7]:
# Fix one colour per term (Alphabet qualitative palette) so all later plots agree.
color_discrete_map_terms = {
    term: px.colors.qualitative.Alphabet[i]
    for i, term in enumerate(socio_political_terms)
}
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list(color_discrete_map_terms.keys()),
    y=[0.5] * len(color_discrete_map_terms),
    text=socio_political_terms,
    textangle=90,
    marker_color=list(color_discrete_map_terms.values())
))
fig.update_layout(showlegend=False, xaxis={'showgrid': False, 'visible': False}, yaxis={'showgrid': False, 'visible': False})

The corpus¶

The corpus processing phase follows BMG's workflow as closely as possible to keep models compatible. There are 2 exceptions:

  • all lemmas are converted to lowercase and
  • anomalous lemmas (mostly punctuation) are added to stopword list.
In [8]:
# prepare the corpus: tokens to strip (punctuation marks and the newline token)
# NOTE(review): redefined at In [18] with '' appended; this earlier version is not used in between
punctuation = ['.', ',', '...', ';', ':', '?', '(', ')', '-', '!', '[', ']', '"', "'", '""', '\n']
In [9]:
# corpus files (lemmatised LatinISE, IT subset only)
# alternative location: /home/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas/
dir_input = os.path.join("/home/krzys/Kod/lvlt22/BMG/LatinISE_1/")  # includes texts first omitted due to parsing issues
dir_in = os.path.join(dir_input, "preprocessed_lemmas")
dir_in_words = os.path.join(dir_input, "preprocessed_tokens")
files = [fname for fname in os.listdir(dir_in) if "IT" in fname]

Process the metadata¶

We'll be storing corpus metadata in a data frame.

In [10]:
# metadata (BMG): one row per document (id, title, creator, date, type, file)
metadata_df = pd.read_csv(os.path.join(dir_input, 'latinise_metadata.csv'), sep = ",")
metadata_df = metadata_df[metadata_df['id'].str.startswith("IT")]  # keep the IT subset, matching the file list above
metadata_df["date"] = metadata_df["date"].astype('int') #ensure we're working with integers
# preview must be the LAST expression of the cell to render; the original
# called .head() mid-cell, which silently displayed nothing
metadata_df.head()
In [11]:
first_date = min(metadata_df.date)  # earliest attested date in the metadata (-450, see intervals printout below)
In [12]:
# upper bound of the studied period; later texts are filtered out below (BMG choice)
last_date = 900 # BMG

Define size of the time intervals:

In [13]:
# width of each time slice, in years (BMG choice)
size_interval = 450 # BMG

So there are

In [14]:
# number of time slices: (900 - (-450)) / 450 = 3
n_intervals = round((last_date-first_date)/size_interval) # BMG
n_intervals
Out[14]:
3

time intervals.

Define the time periods and split the corpus:

In [66]:
# Interval boundaries: first_date, first_date+step, ... (n_intervals+1 edges). # BMG
# The original filled the list with a cumulative loop; the closed form is equivalent.
intervals = [first_date + t * size_interval for t in range(n_intervals + 1)]
print(intervals)
# human-readable "start-end" label for each consecutive pair of boundaries
periods_labels = [ str(p1) + '-' + str(p2) for p1, p2 in zip(intervals, intervals[1:]) ]
print(periods_labels)
[-450, 0, 450, 900]
['-450-0', '0-450', '450-900']

Add a column to the metadata_df for the time interval:

In [16]:
# Tag every document with the start year of the time interval its date falls in.
# NOTE(review): range() excludes the upper edge, so a text dated exactly 900
# keeps the empty-string tag — confirm that is intended.
metadata_df['time_interval'] = ""
for t in range(len(intervals)-1):
    print(t)
    interval_years = range(intervals[t],intervals[t+1])
    print(interval_years)
    in_interval = metadata_df['date'].isin(interval_years)
    print(metadata_df.loc[in_interval].date)
    metadata_df.loc[in_interval,'time_interval'] = intervals[t]
metadata_df
0
range(-450, 0)
19      -9
34     -49
39     -45
42     -49
57     -80
      ... 
635   -149
638   -107
642    -37
643    -37
649   -229
Name: date, Length: 77, dtype: int64
1
range(0, 450)
18     382
23     399
24     391
37     158
38      49
      ... 
682    382
683    116
684    116
685    116
686    116
Name: date, Length: 235, dtype: int64
2
range(450, 900)
20      524
102     800
104     800
105     800
106     800
       ... 
609     598
634     550
636     550
645     450
1265    533
Name: date, Length: 73, dtype: int64
Out[16]:
id title creator date type file time_interval
18 IT-LAT0001 Vulgata Hieronymus 382 poetry lat_0382_IT-LAT0001.txt 0
19 IT-LAT0537 Ars amatoria Ovidius Naso, Publius -9 poetry lat_-009_IT-LAT0537.txt -450
20 IT-LAT0011 S. Benedicti Regula Benedictus Nursianus 524 prose lat_0524_IT-LAT0011.txt 450
21 IT-LAT0012 In psalmis Davidis expositio Thomas Aquinas: Sanctus 1254 prose lat_1254_IT-LAT0012.txt
22 IT-LAT0014 Adoro te devote Thomas Aquinas: Sanctus 1254 poetry lat_1254_IT-LAT0014.txt
... ... ... ... ... ... ... ...
683 IT-LAT0534_1 De origine et situ Germanorum Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_1.txt 0
684 IT-LAT0534_2 De vita Iulii Agricolae Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_2.txt 0
685 IT-LAT0534_3 Dialogus de oratoribus Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_3.txt 0
686 IT-LAT0534_4 Historiae Tacitus, Publius (Gaius) Cornelius 116 prose lat_0116_IT-LAT0534_4.txt 0
1265 IT-LAT0202 Institutiones Iustinianus, Caesar Flavius (Imperator Iustini... 533 prose lat_0533_IT-LAT0202.txt 450

670 rows × 7 columns

In [17]:
def convert_dates(sign, date0):
    """Format a year as a fixed-width 4-character date string.

    Positive years (sign == "0") are zero-padded to 4 digits ("0382");
    negative years keep their sign and pad the digits to 3 ("-009"), so
    every result is 4 characters wide (for years below 1000), matching
    the corpus file-naming scheme visible in the metadata, e.g.
    "lat_0382_IT-LAT0001.txt" and "lat_-009_IT-LAT0537.txt".

    BUGFIX: the original padded inconsistently — a 1-digit positive year
    became "009" (3 chars) while 2/3-digit years became "0045"/"0382"
    (4 chars), and negative 2/3-digit years became 5-char "-0045"/"-0149".

    Parameters:
        sign:  "0" for AD years; any other value (e.g. "-") is used as a prefix.
        date0: non-negative integer year magnitude.
    Returns:
        The formatted date string.
    """
    # Year zero is unsigned by convention (original returned "0000" for both signs).
    if date0 == 0 or sign == "0":
        return str(date0).zfill(4)
    return str(sign) + str(date0).zfill(3)

Read in corpus files¶

In [18]:
# prepare the corpus: same stop-token list as In [8], now also dropping the empty string ''
punctuation = ['.', ',', '...', ';', ':', '?', '(', ')', '-', '!', '[', ']', '"', "'", '""', '\n', '']
In [19]:
# define corpus subset: every document dated up to last_date (900)
corpus_subset = metadata_df[metadata_df['date'] <= last_date].copy().reset_index(drop=True)
# NOTE(review): texts dated exactly 900 pass this filter but fall outside every
# range(intervals[t], intervals[t+1]) used below — confirm they should be kept
filenames_subset = corpus_subset['file'] # filenames were defined above to get IT files only
In [20]:
class NltkCorpusFromDirNew(PlaintextCorpusReader):
    """A subclass of NLTK PlaintextCorpusReader.

    Reads plain-text files that hold one sentence per line with
    space-separated tokens (hence the Space/Line tokenizers), and can
    lowercase tokens and drop punctuation tokens while words are being
    streamed, so no separate cleaning pass over words() is needed.
    """
    
    word_tokenizer=SpaceTokenizer() # tokenize on whitespace
    sent_tokenizer=LineTokenizer() # assume sentence per line
    
    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",        
        word_tokenizer=word_tokenizer,
        sent_tokenizer=sent_tokenizer,
        tolower=False, punctuation=None
    ):
        # tolower: lowercase every token yielded by words()
        # punctuation: iterable of tokens to drop entirely (None = keep everything)
        PlaintextCorpusReader.__init__(self, root=root, fileids=fileids, encoding=encoding,
                                       word_tokenizer=word_tokenizer,
                                       sent_tokenizer=sent_tokenizer)
        self.tolower = tolower
        self.punctuation = punctuation
        
    def _read_word_block(self, stream):
        # Override of the parent's streaming word reader: applies the
        # lowercase/punctuation filtering on the fly.
        # NOTE(review): sents() appears NOT to pass through this method —
        # the cells below re-clean latinise.sents() explicitly — confirm
        # against the nltk corpus-reader internals.
        words = []
        for i in range(20):  # Read 20 lines at a time.
            if self.punctuation is not None:
                words.extend( [ token.lower() if self.tolower == True else token for token 
                               in self._word_tokenizer.tokenize(stream.readline()) 
                               if token not in self.punctuation and token != '' 
                              ])
            else:
                words.extend( [ token.lower() if self.tolower == True else token for token in self._word_tokenizer.tokenize(stream.readline()) ])
        return words
In [21]:
# prepare the corpus reader (lowercasing and punctuation removal happen at read time)
latinise = NltkCorpusFromDirNew(root=dir_in, fileids=filenames_subset,
                                punctuation=punctuation, tolower=True)
# one nltk Text object per document
latinise_docs = [Text(latinise.words(fileid)) for fileid in latinise.fileids()]
print("This corpus contains ", len(latinise_docs), " documents.")
This corpus contains  385  documents.
In [22]:
# Flat list of cleaned sentences, each a list of lowercased tokens.
# BUGFIX: the original appended *generator expressions*, so `corpus` held
# one-shot lazy iterators instead of token lists (compare the list
# comprehension used for sents_clean in the time-slice cell below).
corpus = []
for sent in latinise.sents():
    corpus.append([token.lower() for token in sent
                   if token not in punctuation and token != ''])

Splitting the corpus¶

The corpus is split into slices, each covering size_interval years.

In [23]:
# time2corpus maps each time-slice index to the cleaned sentences (lists of
# lowercased lemmas) of the documents dated within that slice.
time2corpus = {}

# Iterate over the time slices; the redundant 900 boundary is skipped
# (previously: for t in range(n_intervals+1)).
for t in range(n_intervals):
    slice_files = list(corpus_subset.loc[corpus_subset['time_interval'] == intervals[t]]["file"])
    print("retrieving the subcorpus for interval ", intervals[t])
    cleaned = []
    for sent in latinise.sents(fileids=slice_files):
        cleaned.append([token.lower() for token in sent
                        if token not in punctuation and token != ''])
    time2corpus[t] = cleaned
retrieving the subcorpus for interval  -450
retrieving the subcorpus for interval  0
retrieving the subcorpus for interval  450

The time2corpus variable is a dictionary with time slices as keys. Each item is a list of sentences, each being a list of lemmas.

In [24]:
print(f'Dictionary keys are: {list(time2corpus.keys())}')
# BUGFIX: the slice [0:2] yields two sentences, but the message claimed three
print('First 2 sentences from the 3rd corpus slice are: ', time2corpus[2][0:2])
Dictionary keys are: [0, 1, 2]
First 3 sentences from the 3rd corpus slice are:  [['obsculta', 'o', 'filius', 'praeceptum', 'magister', 'et', 'inclino', 'auris', 'cor', 'tuus', 'et', 'admonitio', 'pius', 'pater', 'libet', 'excipe', 'et', 'efficaciter', 'comple', 'ut', 'ad', 'is', 'per', 'oboedientia', 'labor', 'redeo', 'ab', 'quo', 'per', 'inoboedientia', 'desidia', 'recedo'], ['ad', 'tu', 'ergo', 'nunc', 'ego', 'sermo', 'dirigo', 'quisquis', 'abrenuntio', 'proprius', 'voluntas', 'dominus', 'christus', 'verus', 'rex', 'militaturus', 'oboedientia', 'fortis', 'atque', 'praeclarus', 'arma', 'sumo']]

Retrieve collocations¶

Build list of n-grams and compute association strengths¶

We're going to use functions available in the nltk package. In order to do so, we first need (1) to convert corpora into the nltk-compatible format. Next, from each corpus, we are (2) retrieving lists of n-grams which (3) we feed to the so-called finders which count n-grams, filter out stopwords, and apply association strength measures to frequency counts.

In [25]:
# use the full (non-time-segmented) corpus reader for the first round of collocations
corpus_nltk = latinise
print(f"This corpus contains {len(corpus_nltk.sents())} sentences and {len(corpus_nltk.words())} words.")
This corpus contains 318340 sentences and 5298018 words.

First, we're retrieving 50 collocations computed on non-contiguous 5-grams of each term in the entire corpus.

In [29]:
# Build (or load) the top-50 collocations of each term over the whole corpus.
if rebuild:  # idiomatic truth test instead of `== True` / `elif == False`
    collocs = list()
    for term in socio_political_terms:
        print(f"\nBuilding finder for the term: {term}")
        colls = BuildCollocs(corpus_nltk, term=term, window=5, filtering=True, top=50)
        colls.getFinder()
        print(f"Getting top 50 collocations for the term: {term}")
        tops = colls.getAllNtops()
        collocs.append((term, tops))
        
        # checkpoint after every term so a crash mid-run loses nothing
        with open('collocations_all.pickle', 'wb') as f:
            pickle.dump(collocs,f)
          
else:
    # NOTE: pickle.load is only safe on this trusted, locally produced cache
    with open('collocations_all.pickle', 'rb') as f:
        collocs = pickle.load(f)
Building finder for the term: civitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
Getting top 50 collocations for the term: civitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consilium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: consilium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consul
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
Getting top 50 collocations for the term: consul
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: dux
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: dux
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: gens
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa68aa62c0>
<nltk.collocations.BigramCollocationFinder object at 0x7faa68aa62c0>
Getting top 50 collocations for the term: gens
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: hostis
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: hostis
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: imperator
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: imperator
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: jus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
Getting top 50 collocations for the term: jus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: labor
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa0dd663b0>
<nltk.collocations.BigramCollocationFinder object at 0x7faa0dd663b0>
Getting top 50 collocations for the term: labor
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: natio
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: natio
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: nobilitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa84528e80>
<nltk.collocations.BigramCollocationFinder object at 0x7faa84528e80>
Getting top 50 collocations for the term: nobilitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontifex
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa14b89c90>
<nltk.collocations.BigramCollocationFinder object at 0x7faa14b89c90>
Getting top 50 collocations for the term: pontifex
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontificium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: pontificium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: populus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: populus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: potestas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa84528e80>
<nltk.collocations.BigramCollocationFinder object at 0x7faa84528e80>
Getting top 50 collocations for the term: potestas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: regnum
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa0dd663b0>
<nltk.collocations.BigramCollocationFinder object at 0x7faa0dd663b0>
Getting top 50 collocations for the term: regnum
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: senatus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa14b89c90>
<nltk.collocations.BigramCollocationFinder object at 0x7faa14b89c90>
Getting top 50 collocations for the term: senatus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: sodes
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: sodes
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: urbs
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa68aa62c0>
<nltk.collocations.BigramCollocationFinder object at 0x7faa68aa62c0>
Getting top 50 collocations for the term: urbs
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Let's inspect the data structure: each 2-tuple contains:

  • the term itself and
  • a list of 2-tuples: (association_measure, list_of_collocates).
In [30]:
# Peek at the data structure: (term, [(association_measure, collocate_pairs), ...])
first_term, first_measures = collocs[0]
print("a term ==> ", first_term,
      "\n", "a list of collocation tuples ==> ", first_measures[0])
a term ==>  civitas 
 a list of collocation tuples ==>  ('chi_sq', [('civitas', 'velovocorum'), ('in', 'civitas'), ('civitas', 'restaurasse'), ('alexandrinae', 'civitas'), ('antiochenae', 'civitas'), ('civitas', 'muratus'), ('primoris', 'civitas'), ('princeps', 'civitas'), ('civitas', 'exterminari'), ('alexandrina', 'civitas'), ('civitas', 'hrofi'), ('inmunitates', 'civitas'), ('platea', 'civitas'), ('rages', 'civitas'), ('civitas', 'lundonia'), ('civitas', 'pergamena'), ('singulasque', 'civitas'), ('augustofratensis', 'civitas'), ('bergistanorum', 'civitas'), ('bitoricae', 'civitas'), ('cimeliarchio', 'civitas'), ('civitas', 'arbee'), ('civitas', 'auferatis'), ('civitas', 'carturi'), ('civitas', 'galaditidis'), ('civitas', 'pirisaboram'), ('clypeam', 'civitas'), ('devastabunt', 'civitas'), ('eboracae', 'civitas'), ('efron', 'civitas'), ('gaditana', 'civitas'), ('singidunum', 'civitas'), ('veronamque', 'civitas'), ('civitas', 'david'), ('coriolos', 'civitas'), ('civitas', 'dono'), ('civitas', 'palmarum'), ('civitas', 'nazareth'), ('civitas', 'iudas'), ('porta', 'civitas'), ('graecia', 'civitas'), ('civitas', 'regnavitque'), ('finitimus', 'civitas'), ('amison', 'civitas'), ('andegavis', 'civitas'), ('astensis', 'civitas'), ('carnain', 'civitas'), ('civitas', 'aegyptiacam'), ('civitas', 'brittiis'), ('civitas', 'irruptus')])

Now we'll retrieve collocations for every period of the time-segmented corpus.

In [31]:
# Build (or load) per-time-slice collocations, mirroring the whole-corpus cell above.
if rebuild:  # idiomatic truth test instead of `== True` / `elif == False`
    # retrieve collocations for time slices
    collocs_time = list()
    for key, corp in time2corpus.items():
        # read the corpus into an NLTK-compatible format
        corp_nltk = NltkCorpusFromList(corp)
        print(f"\nBuilding for the corpus: {key}")
        print(f"This corpus contains {len(corp_nltk.sents())} sentences and {len(corp_nltk.words())} words.")
        for term in socio_political_terms:
            print(f"\nBuilding finder for the term: {term}")
            colls = BuildCollocs(corp_nltk, term=term, window=5, filtering=True, top=50)
            colls.getFinder()
            print(f"Getting top 50 collocations for the term: {term}")
            tops = colls.getAllNtops()
            collocs_time.append((key, term, tops))
        
        # checkpoint after every time slice so a crash mid-run loses little
        with open('collocations_all_time.pickle', 'wb') as f:
            pickle.dump(collocs_time,f)

else:
    # NOTE: pickle.load is only safe on this trusted, locally produced cache
    with open('collocations_all_time.pickle', 'rb') as f:
        collocs_time = pickle.load(f)
Building for the corpus: 0
This corpus contains 57242 sentences and 1286379 words.

Building finder for the term: civitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: civitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consilium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dc60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dc60>
Getting top 50 collocations for the term: consilium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consul
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: consul
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: dux
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: dux
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: gens
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
Getting top 50 collocations for the term: gens
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: hostis
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: hostis
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: imperator
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
Getting top 50 collocations for the term: imperator
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: jus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: jus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: labor
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
Getting top 50 collocations for the term: labor
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: natio
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: natio
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: nobilitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: nobilitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontifex
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
Getting top 50 collocations for the term: pontifex
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontificium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: pontificium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: populus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9d68c4a60>
Getting top 50 collocations for the term: populus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: potestas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: potestas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: regnum
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: regnum
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: senatus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: senatus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: sodes
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
Getting top 50 collocations for the term: sodes
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: urbs
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: urbs
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building for the corpus: 1
This corpus contains 155541 sentences and 2862731 words.

Building finder for the term: civitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435feb0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435feb0>
Getting top 50 collocations for the term: civitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consilium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435df30>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435df30>
Getting top 50 collocations for the term: consilium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consul
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: consul
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: dux
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435eec0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435eec0>
Getting top 50 collocations for the term: dux
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: gens
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ed70>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ed70>
Getting top 50 collocations for the term: gens
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: hostis
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dc60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dc60>
Getting top 50 collocations for the term: hostis
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: imperator
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ded0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ded0>
Getting top 50 collocations for the term: imperator
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: jus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: jus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: labor
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: labor
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: natio
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: natio
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: nobilitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e6b0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e6b0>
Getting top 50 collocations for the term: nobilitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontifex
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ee90>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ee90>
Getting top 50 collocations for the term: pontifex
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontificium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
<nltk.collocations.BigramCollocationFinder object at 0x7faaa94649d0>
Getting top 50 collocations for the term: pontificium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: populus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e080>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e080>
Getting top 50 collocations for the term: populus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: potestas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ed70>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ed70>
Getting top 50 collocations for the term: potestas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: regnum
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: regnum
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: senatus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: senatus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: sodes
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
<nltk.collocations.BigramCollocationFinder object at 0x7faa6932b970>
Getting top 50 collocations for the term: sodes
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: urbs
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dd20>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dd20>
Getting top 50 collocations for the term: urbs
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building for the corpus: 2
This corpus contains 105557 sentences and 1148908 words.

Building finder for the term: civitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dd20>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dd20>
Getting top 50 collocations for the term: civitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consilium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
Getting top 50 collocations for the term: consilium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: consul
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435f190>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435f190>
Getting top 50 collocations for the term: consul
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: dux
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ee90>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ee90>
Getting top 50 collocations for the term: dux
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: gens
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435df30>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435df30>
Getting top 50 collocations for the term: gens
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: hostis
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
Getting top 50 collocations for the term: hostis
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: imperator
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435fd60>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435fd60>
Getting top 50 collocations for the term: imperator
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: jus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
<nltk.collocations.BigramCollocationFinder object at 0x7faa09b07b50>
Getting top 50 collocations for the term: jus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: labor
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435eec0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435eec0>
Getting top 50 collocations for the term: labor
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: natio
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
Getting top 50 collocations for the term: natio
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: nobilitas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e050>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e050>
Getting top 50 collocations for the term: nobilitas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontifex
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dfc0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dfc0>
Getting top 50 collocations for the term: pontifex
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: pontificium
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e080>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435e080>
Getting top 50 collocations for the term: pontificium
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: populus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dcc0>
Getting top 50 collocations for the term: populus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: potestas
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dd20>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dd20>
Getting top 50 collocations for the term: potestas
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: regnum
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dfc0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435dfc0>
Getting top 50 collocations for the term: regnum
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: senatus
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ed70>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ed70>
Getting top 50 collocations for the term: senatus
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: sodes
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c63119c0>
Getting top 50 collocations for the term: sodes
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

Building finder for the term: urbs
applying term filter
applied term filter
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ded0>
<nltk.collocations.BigramCollocationFinder object at 0x7fa9c435ded0>
Getting top 50 collocations for the term: urbs
Getting finder
Got finder
Getting n n-grams with  chi_sq
Getting n n-grams with  dice
Getting n n-grams with  fisher
Getting n n-grams with  jaccard
Getting n n-grams with  likelihood_ratio
Getting n n-grams with  mi_like
Getting n n-grams with  phi_sq
Getting n n-grams with  pmi
Getting n n-grams with  poisson_stirling
Getting n n-grams with  raw_freq
Getting n n-grams with  student_t

The structure of the collocs_time is similar to collocs, as the list contains 3-tuples of (period, term, list_of_collocs):

[ ( period_0, term_0, [(measure_0, [collocs_0])] ) ]

In [35]:
# Peek at the first entry to illustrate the (period, term, collocs) structure.
first_entry = collocs_time[0]
print("period ==> ", first_entry[0], "\n",
      "term ==> ", first_entry[1], "\n",
      "a list of collocation tuples ==> ", first_entry[2][0])
period ==>  0 
 term ==>  civitas 
 a list of collocation tuples ==>  ('chi_sq', [('civitas', 'dono'), ('princeps', 'civitas'), ('primoris', 'civitas'), ('bergistanorum', 'civitas'), ('gaditana', 'civitas'), ('foederatus', 'civitas'), ('in', 'civitas'), ('status', 'civitas'), ('civitas', 'donarunt'), ('civitas', 'pergamena'), ('absque', 'civitas'), ('altiusque', 'civitas'), ('amantini', 'civitas'), ('ascribi', 'civitas'), ('avaritiaque', 'civitas'), ('aveniensem', 'civitas'), ('calventi', 'civitas'), ('ccicc', 'civitas'), ('certim', 'civitas'), ('civitas', 'admonitosque'), ('civitas', 'amicitae'), ('civitas', 'ancillaris'), ('civitas', 'annitebatur'), ('civitas', 'apelaurum'), ('civitas', 'caeritem'), ('civitas', 'camertinum'), ('civitas', 'capenatiumque'), ('civitas', 'celeiates'), ('civitas', 'cerdiciatesque'), ('civitas', 'chaldaeicum'), ('civitas', 'classiumque'), ('civitas', 'conmunicatis'), ('civitas', 'conpensatio'), ('civitas', 'delerant'), ('civitas', 'depeculatus'), ('civitas', 'descendissetque'), ('civitas', 'dicarit'), ('civitas', 'dictique'), ('civitas', 'discive'), ('civitas', 'donatus'), ('civitas', 'epiros'), ('civitas', 'exinanitae'), ('civitas', 'explorandumque'), ('civitas', 'exterminarint'), ('civitas', 'extrueret'), ('civitas', 'foederatisque'), ('civitas', 'frusinates'), ('civitas', 'gemituque'), ('civitas', 'gravisque'), ('civitas', 'hampsicoram')])

Convert data for easy access¶

Let's now convert these data structures to dictionary for easier access.

In [36]:
# Convert the (term, collocation_sets) tuples into a nested dict:
# collocs_dict[term][measure] == [list_of_bigrams, list_of_collocates_only]
# (FIX: removed the redundant double setdefault() calls and the
# setdefault-then-overwrite pattern of the original.)
collocs_dict = {}
for term, collocs_set in collocs:
    term_dict = collocs_dict.setdefault(term, {})
    for coeff, colls in collocs_set:  # coeff: association measure name, e.g. "dice"
        # keep only the partner word of each bigram; for self-collocations
        # such as (term, term) the filter empties the pair, so restore the term
        partners = [[w for w in bigram if w != term] for bigram in colls]
        colls_only = [p[0] if p else term for p in partners]
        term_dict[coeff] = [colls, colls_only]

The collocations for each term and association_measure may be accessed as: collocs_dict[term][association_measure]. For example:

In [37]:
# inspect the dictionary
# Shows the 2-element list [bigram_list, collocates_only] stored under the
# "dice" measure for the term "civitas".
print(collocs_dict["civitas"]["dice"])
[[('princeps', 'civitas'), ('in', 'civitas'), ('civitas', 'david'), ('porta', 'civitas'), ('de', 'civitas'), ('civitas', 'iudas'), ('civitas', 'dono'), ('civitas', 'romanus'), ('noster', 'civitas'), ('civitas', 'suus'), ('per', 'civitas'), ('totus#2', 'civitas'), ('idem', 'civitas'), ('apud', 'civitas'), ('provincia', 'civitas'), ('graecia', 'civitas'), ('universus', 'civitas'), ('episcopus', 'civitas'), ('ad', 'civitas'), ('civitas', 'et'), ('omnis', 'civitas'), ('civitas', 'quis#2'), ('civitas', 'noster'), ('multus', 'civitas'), ('singulus', 'civitas'), ('murus', 'civitas'), ('et', 'civitas'), ('civitas', 'capio'), ('hic', 'civitas'), ('civitas', 'civitas'), ('civitas', 'sanctus'), ('civitas', 'in'), ('civitas', 'rex'), ('civitas', 'do'), ('civitas', 'mitto'), ('civitas', 'omnis'), ('civitas', 'cum'), ('primoris', 'civitas'), ('civitas', 'episcopus'), ('asia', 'civitas'), ('liber', 'civitas'), ('rex', 'civitas'), ('usque', 'civitas'), ('civitas', 'atque'), ('venio', 'civitas'), ('quidam', 'civitas'), ('aedifico', 'civitas'), ('hierosolyma', 'civitas'), ('civitas', 'magnus'), ('civitas', 'murus')], ['princeps', 'in', 'david', 'porta', 'de', 'iudas', 'dono', 'romanus', 'noster', 'suus', 'per', 'totus#2', 'idem', 'apud', 'provincia', 'graecia', 'universus', 'episcopus', 'ad', 'et', 'omnis', 'quis#2', 'noster', 'multus', 'singulus', 'murus', 'et', 'capio', 'hic', 'civitas', 'sanctus', 'in', 'rex', 'do', 'mitto', 'omnis', 'cum', 'primoris', 'episcopus', 'asia', 'liber', 'rex', 'usque', 'atque', 'venio', 'quidam', 'aedifico', 'hierosolyma', 'magnus', 'murus']]

Synchronic collocations¶

Let's assume that the overlap between collocational sets of two or more words is indicative of their (dis)similarity. In this section, we are investigating synchronic collocation sets, that is collocations retrieved from the entire corpus. In the next sections, we'll be taking a closer look at diachronic overlap.

We choose for further analyses collocates retrieved with Dice coefficient as it usually yields the most interpretable results in manual corpus analysis (ie. content words, easy to understand syntagmatic and paradigmatic relation to node word).

In [38]:
# Collect, per term, the "dice" collocate-only lists with matching term labels.
labels = [term for term in collocs_dict]
coll_sets = [measures["dice"][1] for measures in collocs_dict.values()]
df = collDiffs.collDf(coll_sets, labels) #the df variable will be recycled
df.head()
Out[38]:
colloc slice rank
0 princeps civitas 1
1 in civitas 2
2 david civitas 3
3 porta civitas 4
4 de civitas 5

Collocational overlap and semantic similarity¶

Let's look which terms overlap the most and the least.

In [39]:
# plotting looong ranking tables
def showLongTable(dataframe, show=True, color=None, colormap=None):
    """Render a (long) ranking table as a plotly Table.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The table to display.
    show : bool
        If True, build a Figure and call .show(); otherwise return the
        go.Table trace so it can be embedded in subplots.
    color : str or None
        Column whose values select row colors from `colormap`.
    colormap : dict or None
        Mapping from `color`-column values to color strings.
    """
    rowcolors = None
    if color is not None and colormap is not None:
        rowcolors = [colormap[i] for i in dataframe[color]]
    tbl = go.Table(
        header=dict(values=list(dataframe.columns),
                    align='center'),
        # FIX: removed a dead `dataframe.iteritems()` call whose result was
        # discarded, and switched to .items() — iteritems() was removed in
        # pandas 2.0.
        cells=dict(values=[data for (_col, data) in dataframe.items()],
                   fill_color=[rowcolors, "white", "white"],
                   line_color=[rowcolors] if colormap is not None else None,
                   align='center'))

    return go.Figure(data=[tbl]).show() if show else tbl
In [40]:
# Build the "most similar" and "least similar" ranking tables side by side.
top_table = showLongTable(collDiffs.getNTop(df, top=5, ascending=False),
                          show=False, color="node",
                          colormap=color_discrete_map_terms)
bottom_table = showLongTable(collDiffs.getNTop(df, top=5, ascending=True),
                             show=False, color="node",
                             colormap=color_discrete_map_terms)

# place both tables in a single two-column figure
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "table"}, {"type": "table"}]])
for col, tbl in enumerate([top_table, bottom_table], start=1):
    fig.add_trace(tbl, row=1, col=col)
fig.update_layout(height=800,
                  title_text="5 most and least similar terms by number of overlapping collocates\
                  <br><sup>(in the 'least' table only 5 results are printed)</sup>"
                 )
fig.show()

In terms of collocational overlap count, the most similar pairs of terms in our set are:

In [41]:
# 10 most similar word pairs in the set (by colloc overlap)
sims_all = (collDiffs.getNTop(df, top=-1, ascending=False)
            .sort_values("count", ascending=False)
            .drop_duplicates(subset=["node", "collocate"]))
# BUG FIX: the self-pair filter result was previously computed and discarded;
# keep the assignment so (term, term) rows are actually removed.
sims_all = sims_all[sims_all["node"] != sims_all["collocate"]]
# drop mirrored duplicates such as (a, b) vs (b, a)
dupes = sims_all.apply(frozenset, axis=1).duplicated()
sims = sims_all[~dupes]
sims.nlargest(10, "count")
Out[41]:
node count collocate
68 gens 15 populus
0 civitas 13 populus
1 civitas 13 urbs
2 civitas 12 gens
85 hostis 11 dux
187 pontifex 10 consul
307 urbs 10 hostis
17 consilium 9 populus
224 populus 9 senatus
71 gens 7 regnum

The least similar pairs, on the other hand, are:

In [42]:
# 10 most dissimilar terms in the set (by colloc overlap)
# nsmallest keeps the first occurrence on ties, so the ordering among the
# many zero-overlap pairs follows their position in `sims`.
sims.nsmallest(10,"count")
Out[42]:
node count collocate
300 sodes 0 hostis
271 regnum 0 pontifex
321 urbs 0 pontifex
320 urbs 0 nobilitas
319 urbs 0 natio
270 regnum 0 hostis
318 urbs 0 jus
252 potestas 0 labor
288 senatus 0 pontifex
285 senatus 0 labor

Let's inspect the collocational overlap of the entire set of terms (~ semantic similarity).

In [43]:
# Plot the term-by-term similarity matrix (collocational overlap counts).
heatmap = collDiffs.plotCollDf(df, show=False)
heatmap.update_xaxes(title="Term")
heatmap.update_yaxes(title="Term")
heatmap.update_layout(height=800, title_text="Number of overlapping collocations")
heatmap.show()
In [44]:
# TODO: normalize counts
# TODO: overlaps by the collocate's rank
# TODO: limit display to count > 1

Distributional similarity via collocational overlap may be used to discover term clusters.

In [45]:
# PERF FIX: hoist the overlap-matrix computation — collDiffs.all2all() was
# previously executed twice for the same input.
overlap_matrix = collDiffs.all2all(coll_sets)[2]

# default linkage method = complete
dendro1 = ff.create_dendrogram(overlap_matrix,
                              orientation='left', labels=labels)
# let's switch linkage method to Ward
dendro2 = ff.create_dendrogram(overlap_matrix,
                              orientation='left', labels=labels, linkagefun=lambda x: sch.linkage(x,'ward'))
fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.3)

for data in dendro1['data']:
    fig.add_trace(data, row=1,col=1)
fig.update_xaxes(
    title_text="linkage = complete", row=1, col=1,ticktext=dendro1.layout.yaxis.ticktext,
    showticklabels=False)
fig.update_yaxes(row=1, col=1,ticktext=dendro1.layout.yaxis.ticktext, tickvals=dendro1.layout.yaxis.tickvals)

for data in dendro2['data']:
    fig.add_trace(data, row=1, col=2)
fig.update_xaxes(
    title_text="linkage = Ward", row=1, col=2,
    showticklabels=False)
fig.update_yaxes(row=1, col=2,ticktext=dendro2.layout.yaxis.ticktext, tickvals=dendro2.layout.yaxis.tickvals)


fig.update_layout(title="Similar terms clustered by the number of overlapping collocates",
                  showlegend=False, height=500, width=900)
fig.show()

Theoretically, each of the socio_political_terms can have (len(socio_political_terms) -1) * ntop (where ntop is a number of top collocates taken into consideration) overlapping collocates. This fact may be used to gauge both the semantic coherence of the set as well as "prototypicality" of each of the terms or their semantic distance from other terms in the set.

Let's inspect total overlap counts for all terms in the set.

In [46]:
# terms by total number of overlapping collocates (~ set coherence)
sims_all_sum = sims_all[["node", "count"]].groupby("node").sum().reset_index().sort_values("count", ascending=False)
sims_med = sims_all_sum["count"].median()
# BUG FIX: plotly expects category_orders to be a dict {column: list_of_values},
# not a list of dicts (which was silently ignored).
# NOTE(review): color_discrete_map only takes effect together with a color=
# column — confirm whether color="node" was intended here.
fig = px.bar(sims_all_sum, x='node', y='count',
             category_orders={"node": sims_all_sum["node"].tolist()},
             color_discrete_map=color_discrete_map_terms)
fig.add_hline(y=sims_med, line_color="red", line_dash="dash",
              annotation_text="median = "+str(sims_med), 
              annotation_position="bottom right",annotation_font_color="red")
fig.update_layout(title="Overall number of overlapping collocates", height=400)
fig.show()
In [47]:
# TODO: check how it changes with ranks = [10, 20, ...]

Diachronic collocations¶

Let's now turn to time-aware collocation counts.

Data manipulation¶

As before, to make data manipulation easier, we'll start with converting collocation lists to a Python dictionary.

In [48]:
# Convert the (period, term, collocation_sets) tuples into a nested dict:
# collocs_time_dict[period][term][measure] == [bigram_list, collocates_only]
# (FIX: removed the redundant setdefault/reassign pattern and the dead
# commented-out assignment of the original.)
collocs_time_dict = {}
for period, term, coll_sets_for_term in collocs_time:
    period_dict = collocs_time_dict.setdefault(period, {})
    term_dict = period_dict.setdefault(term, {})
    for coeff, colls in coll_sets_for_term:  # coeff: association measure, e.g. "dice"
        # keep only the partner word of each bigram; for self-collocations
        # such as (term, term) the filter empties the pair, so restore the term
        partners = [[w for w in bigram if w != term] for bigram in colls]
        colls_only = [p[0] if p else term for p in partners]
        term_dict[coeff] = [colls, colls_only]

The collocations for each period, term and association_measure may be accessed as: collocs_time_dict[period][term][association_measure]. The 2-tuple thus retrieved contains the original list of 2-grams and a simplified list of collocates. For example:

In [49]:
# inspect the dictionary
# index [0] holds the raw bigrams, [1] the simplified collocates-only list
print("original list ==> ", collocs_time_dict[0]["civitas"]["dice"][0], "\n\n", "collocates only ==> ", collocs_time_dict[0]["civitas"]["dice"][1])
original list ==>  [('princeps', 'civitas'), ('civitas', 'dono'), ('noster', 'civitas'), ('in', 'civitas'), ('graecia', 'civitas'), ('civitas', 'suus'), ('liber', 'civitas'), ('status', 'civitas'), ('primoris', 'civitas'), ('hic', 'civitas'), ('de', 'civitas'), ('ex', 'civitas'), ('asia', 'civitas'), ('civitas', 'noster'), ('civitas', 'do'), ('muto', 'civitas'), ('civitas', 'mitto'), ('civitas', 'status'), ('unus', 'civitas'), ('civitas', 'princeps'), ('omnis', 'civitas'), ('civitas', 'quis#2'), ('alius', 'civitas'), ('jus', 'civitas'), ('ceterus', 'civitas'), ('nullus', 'civitas'), ('civitas', 'impero'), ('libertas', 'civitas'), ('vir', 'civitas'), ('universus', 'civitas'), ('rex', 'civitas'), ('civitas', 'sum'), ('finitimus', 'civitas'), ('lex', 'civitas'), ('civitas', 'mos'), ('totus#2', 'civitas'), ('civitas', 'non'), ('civitas', 'rex'), ('summus', 'civitas'), ('civitas', 'omnis'), ('civitas', 'hic'), ('et', 'civitas'), ('is', 'civitas'), ('civitas', 'teneo'), ('civitas', 'libertas'), ('civitas', 'atque'), ('civitas', 'legatus'), ('civitas', 'tollo'), ('homo', 'civitas'), ('consilium', 'civitas')] 

 collocates only ==>  ['princeps', 'dono', 'noster', 'in', 'graecia', 'suus', 'liber', 'status', 'primoris', 'hic', 'de', 'ex', 'asia', 'noster', 'do', 'muto', 'mitto', 'status', 'unus', 'princeps', 'omnis', 'quis#2', 'alius', 'jus', 'ceterus', 'nullus', 'impero', 'libertas', 'vir', 'universus', 'rex', 'sum', 'finitimus', 'lex', 'mos', 'totus#2', 'non', 'rex', 'summus', 'omnis', 'hic', 'et', 'is', 'teneo', 'libertas', 'atque', 'legatus', 'tollo', 'homo', 'consilium']

From the original dictionary, we're deriving a number of data containers, mainly to simplify visualization.

In [68]:
# Re-key the diachronic collocations as term -> period_label -> collocates.
# (FIX: the inner loop variable no longer shadows the module-level `collocs`,
# and the redundant setdefault-then-assign on period_lbl was collapsed.)
collocs_time_by_term = {}
for period, term_collocs in collocs_time_dict.items():
    period_lbl = periods_labels[period]  # human-readable label, e.g. "-450-0"
    for term, measures in term_collocs.items():
        # "dice"[1] is the collocates-only list for this term and period
        collocs_time_by_term.setdefault(term, {})[period_lbl] = measures["dice"][1]
print("collocs_time_by_term is a Python ", type(collocs_time_by_term), "\n" , "collocs_time_by_term[term][period] ==> a list of the collocates of the term in specific period")
collocs_time_by_term is a Python  <class 'dict'> 
 collocs_time_by_term[term][period] ==> a list of the collocates of the term in specific period
In [69]:
# generate (node,collocate,rank) dataframes
collocs_time_by_term_dfs = dict.fromkeys(collocs_time_by_term.keys()) # dictionary of overlap counts
for term in collocs_time_by_term_dfs.keys():
    # BUG FIX: `periods` is now reset for every term; previously it was
    # initialized once outside the loop and kept growing with duplicate
    # period labels on each iteration.
    periods = []
    coll_sets_time = []
    for period, colls in collocs_time_by_term[term].items():
        periods.append(period)
        coll_sets_time.append(colls)
    collocs_time_by_term_dfs[term] = collDiffs.collDf(coll_sets=coll_sets_time, labels=periods)
print("collocs_time_by_term_dfs is a ", type(collocs_time_by_term_dfs), "\n" , "collocs_time_by_term_dfs[term] ==> df of the collocates with time period and rank")
collocs_time_by_term_dfs["civitas"].head()
collocs_time_by_term_dfs is a  <class 'dict'> 
 collocs_time_by_term_dfs[term] ==> df of the collocates with time period and rank
Out[69]:
colloc slice rank
0 princeps -450-0 1
1 dono -450-0 2
2 noster -450-0 3
3 in -450-0 4
4 graecia -450-0 5

Several of these derived containers hold explicit overlap counts.

In [70]:
# overlap counts: by term by period
collocs_time_by_term_mats = dict.fromkeys(collocs_time_by_term.keys()) # dictionary of overlap counts
for term in collocs_time_by_term_mats.keys():
    coll_sets_time = []
    periods = []
    for period, colls in collocs_time_by_term[term].items():
        periods.append(period)
        coll_sets_time.append(colls)
    # all2all()[2] is the (n_periods, n_periods) overlap-count matrix
    collocs_time_by_term_mats[term] = collDiffs.all2all(coll_sets=coll_sets_time)[2]

# BUG FIX: removed a stray "\" that was printed literally inside the message
print("collocs_time_by_term_mats is a ", type(collocs_time_by_term_mats), "\n" ,
      "collocs_time_by_term_mats[term] ==> 2D matrix of overlap counts (n_periods, n_periods); each row represents total overlap counts of a specific period with other periods\n")

for i, overlap in enumerate(collocs_time_by_term_mats["civitas"]):
    print("period: ", i, "overlap count: ", overlap)
collocs_time_by_term_mats is a  <class 'dict'> 
 collocs_time_by_term_mats[term] ==> 2D matrix of overlap counts (n_periods, n_periods);\ each row represents total overlap counts of a specific period with other periods

period:  0 overlap count:  [43 14  7]
period:  1 overlap count:  [14 42 18]
period:  2 overlap count:  [ 7 18 45]
In [71]:
# overlap counts: by term: (preceding, following)
# FIX: dict.fromkeys(keys, []) made every key share ONE list object (shared
# mutable default); plain fromkeys (value None) is safe because every entry
# is reassigned below.
collocs_time_by_term_overlap = dict.fromkeys(collocs_time_by_term.keys())
for term in collocs_time_by_term_overlap.keys():
    term_overlap = [] # number of overlapping collocates between (preceding, following) periods
    for i, overlap in enumerate(collocs_time_by_term_mats[term]):
        # None marks the leftmost/rightmost period, which has no neighbour
        pre = overlap[i-1] if i > 0 else None
        post = overlap[i+1] if i < len(overlap)-1 else None
        term_overlap.append((pre, post))
    collocs_time_by_term_overlap[term] = term_overlap

# None is set for extreme left and right
for i, overlap in enumerate(collocs_time_by_term_overlap["civitas"]):
    print("period: ", i, "==> overlap with preceding and following period: ", overlap)
period:  0 ==> overlap with preceding and following period:  (None, 14)
period:  1 ==> overlap with preceding and following period:  (14, 18)
period:  2 ==> overlap with preceding and following period:  (18, None)
In [72]:
# long-format df with overlap counts: term | t_i-1 | t_i | overlap count
overs = []
for term in collocs_time_by_term_mats.keys():
    for period1, overlaps in zip(periods, collocs_time_by_term_mats[term]):
        for period2, overlap in zip(periods, overlaps):
            if period1 != period2:  # skip the diagonal (self-overlap)
                overs.append([term, period1, period2, overlap])
overs_df = pd.DataFrame(overs, columns=["term", "source", "target", "count"])
# filter out mirrored duplicates such as (a, b) vs (b, a)
dupes = overs_df.apply(frozenset, axis=1).duplicated()
overs_df = overs_df[~dupes]

overs_df["term"] = overs_df["term"].astype("category")
# we're making sure the categories are encoded in the same way for both columns
# (FIX: dropped the redundant pd.Series(...) wrappers around the columns)
overs_df["target"] = overs_df["target"].astype('category').cat.set_categories(periods)
overs_df["source"] = overs_df["source"].astype('category').cat.set_categories(periods)
overs_df.head()
#overs_df.tail()
Out[72]:
term source target count
0 civitas -450-0 0-450 14
1 civitas -450-0 450-900 7
3 civitas 0-450 450-900 18
6 consilium -450-0 0-450 19
7 consilium -450-0 450-900 5

Diachronic collocation overlap¶

We assume that diachronic collocational overlap (ie. the number of shared collocations between period t_i and t_j) is proportional to semantic similarity of word occurrences and thus indicates the degree of semantic change.

Let's explore this, first, by plotting an overlap (ie. similarity) matrix for each term in our set.

Diachronic overlap by term: heatmap¶

In [73]:
# we'll first define plotting function to facilitate multiplot generation
def build_multiplot(cols, subplot_type, n_items, subplot_titles, **kwargs):
    """Create an empty subplot grid big enough for `n_items` panels.

    Parameters
    ----------
    cols : int
        Number of subplot columns.
    subplot_type : str or None
        Plotly subplot type (e.g. "sankey"); None for the default.
    n_items : int
        Number of panels to accommodate.
    subplot_titles : list[str] or None
        Per-panel titles.
    **kwargs
        Forwarded to plotly's make_subplots.

    Returns
    -------
    (fig, grid_positions) where grid_positions[i] is the (row, col) pair of
    the i-th panel, filled row by row.
    """
    # ceiling division: just enough rows to fit n_items panels
    rows = -(-n_items // cols)
    # FIX: itertools.product(rows, cols) yields (row, col) pairs; they were
    # previously mis-named (col, row) in the comprehension, even though
    # callers already (correctly) used element 0 as the row.
    grid_positions = list(itertools.product(range(1, rows + 1), range(1, cols + 1)))
    specs = [[{"type": subplot_type} for _ in range(cols)] for _ in range(rows)] if subplot_type is not None else None

    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=subplot_titles if subplot_titles is not None else None,
        specs=specs,
        **kwargs
    )

    return fig, grid_positions
In [74]:
# Plot one collocational-overlap heatmap per term, arranged in a 2-column grid.
heatmaps = {}
for term, term_df in collocs_time_by_term_dfs.items():
    term_fig = collDiffs.plotCollDf(term_df, show=False)
    term_fig.update_layout(title="Collocational overlap: "+ term)
    heatmaps[term] = term_fig

fig, rows_cols = build_multiplot(2, None, len(heatmaps), list(heatmaps.keys()),
                                 shared_yaxes=True, shared_xaxes=False, vertical_spacing=0.04)

for i, (term, term_fig) in enumerate(heatmaps.items()):
    row, col = rows_cols[i]
    fig.add_trace(term_fig["data"][0], row=row, col=col)

fig.update_layout(height=1600, showlegend=True)
fig.update_xaxes(
                type="category",
                categoryorder="category ascending",
            )
fig.update_yaxes(type="category",
                 categoryorder="category descending")
fig.show()

Diachronic overlap by term: line plot¶

In [112]:
# period_i:period_i+1 overlap dataframe
def simple_overlap(overlap_dict, periods):
    periods_diffs = [ x+" : "+y for x,y in itertools.pairwise(periods)]    
    overlaps = list()
    for term, overlap_arr in overlap_dict.items():
        #overlaps.setdefault(term,)
        for i, per_diff in enumerate(overlap_arr[0:len(overlap_arr)-1]):
            overlaps.append([term, periods_diffs[i], per_diff[1]])
    df = pd.DataFrame(overlaps)
    df.columns = ["term", "period", "overlap"]
    return df
In [113]:
# One line per term: overlap counts between consecutive periods.
df = simple_overlap(collocs_time_by_term_overlap, periods)

fig_all = px.line(
    df,
    x="period", y="overlap", color="term",
    color_discrete_map=color_discrete_map_terms,
)
fig_all.update_layout(title="Number of overlapping collocations", height=600)
fig_all.show()

# TODO: plot change ratio instead of change count
In [114]:
# by term: small-multiple line plots, one facet per term
df_term = df
fig_by_term = px.line(
    df_term,
    x="period", y="overlap", color="term",
    color_discrete_map=color_discrete_map_terms,
    facet_col="term", facet_col_wrap=5,
)
fig_by_term.update_layout(title="Number of overlapping collocations", height=800, showlegend=False)
fig_by_term.update_xaxes(ticks='', showticklabels=True, tickangle=45, tickfont=dict(size=8))
fig_by_term.show()
 
In [116]:
# by period: distribution of the overlap counts within each period pair
fig = px.box(
    df,
    y="overlap", facet_col="period", facet_col_wrap=4, facet_col_spacing=0,
    notched=True, points="all", hover_data=["term", "overlap", "period"],
    color="period", boxmode="overlay",
)
fig.update_traces(jitter=0, showlegend=False)
fig.update_layout(title="Variation of the overlap counts (by period)", height=400, showlegend=False)
fig.show()
In [117]:
# by word: distribution of the overlap counts per term
fig = px.box(
    df,
    y="overlap", x="term",
    notched=False,
    points="all",
    hover_data=["term", "overlap", "period"],
    color="term", color_discrete_map=color_discrete_map_terms,
)
fig.update_traces(jitter=0, showlegend=False)
fig.update_layout(title="Variation of the overlap counts (by term)", height=300, showlegend=False)
fig.show()

Diachronic overlap by term: sankey diagram¶

In [118]:
fig = go.Figure(data=[go.Sankey(
    node = dict(
      pad = 15,
      line = dict(color = "black", width = 0),
        # BUG FIX: label the nodes with ALL period categories so the labels
        # line up with the categorical codes used as source/target below;
        # "source".unique() omits the final period, which only appears as a
        # target, leaving its node unlabeled.
        label = list(overs_df["source"].cat.categories)
    ),
    # Add links
    link = dict(
      source =  overs_df["source"].cat.codes,
      target =  overs_df["target"].cat.codes,
      value =  overs_df["count"],
      label =  overs_df["term"],
))])

fig.update_layout(title_text="Total diachronic collocation overlap", font_size=20)
fig.show()
In [119]:
# prepare sankey chart of collocational overlap for every term in our set
# NOTE(review): range(len(periods), 1) below is an EMPTY range whenever
# len(periods) >= 1, so x=[] and the period order is NOT actually forced —
# confirm the intended range (possibly range(len(periods))).
# NOTE(review): node labels come from "source".unique(), which omits the
# final period (it only ever appears as a target); the last node may
# therefore be unlabeled — verify against the rendered figure.
sankeys = []
for term in socio_political_terms:
    # subset the overlap table to this term only
    overs_df_this = overs_df[overs_df["term"] == term].copy()
    terms_colors = [ color_discrete_map_terms[t] for t in overs_df_this["term"] ]
    fig = go.Sankey(
        arrangement="snap",
        node = dict(            
            pad = 0,
            thickness = 30,
            line = dict(color = "black", width = 0.5),
            x=[x*10 for x in range(len(periods),1)], #we force the order in which periods are plotted,
            y=[i*10 for i in range(len(periods)) ],
            label =  overs_df_this["source"].unique(),            
        ),
        # Add links
        link = dict(
            source =  overs_df_this["source"].cat.codes,
            target =  overs_df_this["target"].cat.codes,
            value =  overs_df_this["count"],
            line = dict(color = terms_colors, width = 0)
        ))
    sankeys.append(fig)
    
# lay all per-term sankeys out in a 3-column grid
fig, rows_cols = build_multiplot(3, "sankey", len(sankeys), socio_political_terms,
                                 shared_yaxes=True, shared_xaxes=False, vertical_spacing=0.04)

for i, sankey in enumerate(sankeys):
    fig.add_trace(sankey, row=rows_cols[i][0], col=rows_cols[i][1])

fig.update_layout(height=1200, title_text="Diachronic collocation overlap by term")
fig.show()

Clustering diachronic collocations¶

Until now, we have treated diachronic collocation sets as unstructured monoliths. In this section, we resort to word embeddings and clustering techniques in order to investigate fine-grained diachronic changes.

Data manipulation¶

We will employ word2vec embeddings to assess semantic distance between collocates.

In [121]:
# NOTE(review): hardcoded absolute path — consider a configurable DATA_DIR
model_file = '/home/krzys/Kod/streamlit/voces/data/models/latinise_IT/latinise_w2v_v100w5min5'

if rebuild:  # IDIOM FIX: truth test instead of `== True` / `elif == False`
    collocs_corpus = CorpusFromDir("/media/HOME_FOLDERS/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas")
    collocs_model = BuildModels(collocs_corpus)
    # word2vec hyperparameters: 100-dim vectors, window 5, min count 5
    mods = dict(word2vec=dict(vector_size=[100],
                      window=[5],
                      min_count=[5]))

    latinise_w2v_v100w5min5 = collocs_model.build_many(mods)
    latinise_w2v_v100w5min5["word2vec"][0].save(model_file)

    model = latinise_w2v_v100w5min5["word2vec"][0]
else:
    # read the cached model instead of rebuilding it
    model = Word2Vec.load(model_file)

We can now annotate collocates with vectors retrieved from the word2vec model.

In [122]:
# Concatenating the per-term frames; the outer index level becomes the term.
all_collocs_time = pd.concat(collocs_time_by_term_dfs) # all collocates for all terms for all periods
In [123]:
collocs = all_collocs_time["colloc"].unique() # only collocates

# find word2vec vectors for each collocate; out-of-vocabulary words get a
# row of None so they can be detected and dropped below
# (IDIOM FIX: membership test on the mapping directly, without .keys())
coll_vecs = [ model.wv[x]
             if x in model.wv.key_to_index else np.repeat(None, model.wv.vector_size)
             for x in collocs ]

coll_vecs_df = pd.DataFrame(coll_vecs)
coll_vecs_df.index = collocs
# rows that are entirely missing correspond to OOV collocates
not_in_model = coll_vecs_df.isna().all(axis=1)

print( len(coll_vecs_df.loc[not_in_model].index), " words weren't found in the word2vec dictionary, eg. ", list(coll_vecs_df.loc[not_in_model].index[0:10]) )

coll_vecs_df = coll_vecs_df.loc[~not_in_model] # exclude collocates which are not found in the word2vec dictionary
coll_vecs_df.head()
193  words weren't found in the word2vec dictionary, eg.  ['ahitofel', 'l.', 'm.', 'c.', 'q.', 'p.', 'a.', 'cn.', 'd.', 't.']
Out[123]:
0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
princeps 1.499599 1.654052 0.772930 1.065428 0.207113 1.949996 0.902307 -0.708749 -0.655868 0.772048 ... -1.696397 -0.042469 -0.265360 1.599047 1.692026 -0.218211 -0.866921 0.780210 -0.327714 0.286641
dono -0.028665 0.206296 -0.636952 -0.976660 -1.009025 1.534521 -0.018601 0.419318 1.731939 0.566920 ... -0.035175 0.146910 1.603500 1.170355 1.732852 -1.009020 0.818321 -0.086363 -0.048242 -1.025009
noster 1.572031 -1.847746 0.483166 1.169696 0.515046 1.342866 4.558844 0.564918 1.128803 -1.010498 ... -1.073775 0.294596 0.294962 0.136875 0.365881 0.431258 -0.813561 1.461542 1.295876 0.666016
in -0.486388 1.539424 -1.682679 1.438433 0.416782 -0.036690 -0.375583 0.840769 0.698870 -0.866383 ... -3.265582 -0.583095 0.264834 -1.441967 1.034583 -0.769255 -1.308601 -0.488309 -0.014694 0.051321
graecia 0.400272 0.584794 0.923566 0.167011 1.140316 0.195778 1.496356 -0.419286 -1.794344 0.467281 ... -0.763984 0.557046 -0.314156 -0.671911 1.037812 0.144784 2.227822 -0.035667 0.219159 -0.333513

5 rows × 100 columns

Cluster computation¶

Each collocate will be assigned to a cluster based on its vector. In this study, we use k-means clustering with an arbitrary number of 10 clusters.

In [124]:
cluster_n = 10
# FIX: fixed random_state makes the stochastic k-means init reproducible
# under Restart & Run All
kmeans = KMeans(n_clusters=cluster_n, random_state=42).fit_predict(coll_vecs_df) # (1) initialize the model and (2) fit and return predictions
kmeans_dict = dict(zip(coll_vecs_df.index, kmeans)) # for easy access we create a dictionary: {"term", "cluster"}
# OFF-BY-ONE FIX: cluster labels run 0..cluster_n-1, so range(cluster_n)
cluster_color_map = { str(i) : px.colors.qualitative.Safe[i] for i in range(cluster_n) } # for each cluster we fix a color

We can now annotate all collocates with cluster label.

In [125]:
# Annotate each collocate with its cluster label (as a string category);
# collocates missing from the vector table (OOV words) get None.
# (PERF FIX: a column-wise .map replaces the row-wise DataFrame.apply.)
all_collocs_time["kmeans_cluster"] = pd.Categorical(
    all_collocs_time["colloc"].map(lambda c: str(kmeans_dict[c]) if c in kmeans_dict else None)
)
all_collocs_time = (all_collocs_time
                    .reset_index(level=1, drop=True)  # drop the per-term row index
                    .reset_index()                    # promote the term level to a column
                    .rename(columns={"index": "term"}))
all_collocs_time
Out[125]:
term colloc slice rank kmeans_cluster
0 civitas princeps -450-0 1 7
1 civitas dono -450-0 2 2
2 civitas noster -450-0 3 8
3 civitas in -450-0 4 4
4 civitas graecia -450-0 5 7
... ... ... ... ... ...
2733 urbs iuxta 450-900 46 4
2734 urbs episcopus 450-900 47 6
2735 urbs tunc 450-900 48 4
2736 urbs castrum 450-900 49 1
2737 urbs beo 450-900 50 6

2738 rows × 5 columns

Distribution of collocation clusters through time¶

In [126]:
# BUG FIX: category_orders must map a column to a LIST of category values;
# the string 'category ascending' belongs to the axis categoryorder property
# (set below) and was silently misinterpreted here.
fig = px.histogram(all_collocs_time.sort_values("kmeans_cluster"), x="slice",
                   color="kmeans_cluster", color_discrete_map=cluster_color_map,
                   barmode="stack",
                   category_orders={"kmeans_cluster": [str(i) for i in range(0, cluster_n + 1)]},
                   )
fig.update_xaxes(title="Period", categoryorder="category ascending" )
fig.update_yaxes(title="Count")
fig.update_layout(title="Diachronic distribution of collocational clusters (all terms)")
fig.show()
In [127]:
# BUG FIX: category_orders takes a list of category values, not the string
# 'category ascending' (that is an axis categoryorder setting, applied below).
fig = px.histogram(all_collocs_time.sort_values("kmeans_cluster"),
                   x="slice", color="kmeans_cluster", color_discrete_map=cluster_color_map,
                   barmode="stack",
                   category_orders={"kmeans_cluster": [str(i) for i in range(0, cluster_n + 1)]},
                   facet_col="term",
                   facet_col_wrap=3,
                   facet_row_spacing=0.06,
                   facet_col_spacing=0.06,)
fig.update_xaxes(title="Period", categoryorder="category ascending" )
fig.update_yaxes(title="Count")
fig.update_layout(height=1400, showlegend=True, title="Diachronic distribution of collocational clusters (by term)")
fig.show()
In [128]:
# Grouped (side-by-side) bars instead of stacked, two facets per row.
fig = px.histogram(
    all_collocs_time,
    x="slice", color="kmeans_cluster", barmode="group",
    facet_col="term", color_discrete_map=cluster_color_map,
    facet_col_wrap=2,
    category_orders={"kmeans_cluster": [str(i) for i in range(0, cluster_n + 1)]},
    facet_row_spacing=0.06,
    facet_col_spacing=0.06,
)

fig.update_xaxes(title="Period", showticklabels=True)
fig.update_yaxes(title="Count")
fig.update_layout(height=2000, showlegend=True, title="Diachronic distribution of collocational clusters (by term)")
fig.show()
In [129]:
# Per-(period, term) cluster counts in long format, drawn as stacked areas.
cluster_counts = all_collocs_time.groupby(["slice", "term"])["kmeans_cluster"].value_counts().reset_index()
df = pd.DataFrame(cluster_counts)
df.columns = ["slice", "term", "kmeans_cluster", "count"]

fig = px.area(
    df, x="slice", color="kmeans_cluster", color_discrete_map=cluster_color_map,
    y="count", category_orders={"kmeans_cluster": [str(i) for i in range(0, cluster_n + 1)]},
    facet_col="term", facet_col_wrap=4,
    facet_row_spacing=0.06, facet_col_spacing=0.04,
)
fig.update_xaxes(title="", categoryorder="category ascending", showticklabels=True, tickangle=45)
fig.update_layout(height=1400, showlegend=True, title="Diachronic distribution of collocational clusters (by term)")
fig.show()
In [131]:
# Animated version: slide through periods to watch cluster counts shift.
fig = px.histogram(
    all_collocs_time,
    x="kmeans_cluster",
    color="kmeans_cluster",
    barmode="group",
    facet_col="term",
    color_discrete_map=cluster_color_map,
    facet_col_wrap=5,
    animation_frame="slice",
    animation_group="kmeans_cluster",
    category_orders={"kmeans_cluster": [str(i) for i in range(cluster_n + 1)]},
    facet_row_spacing=0.04,
    facet_col_spacing=0.04,
)
fig.update_layout(height=800, showlegend=True,
                  title="Interactive diachronic distribution of collocational clusters (by term)")
fig.show()

Variation of collocation clusters through time¶

Let's assume that the distribution of collocational clusters of a term at time_i corresponds to its semantics at this point in time. Diachronic variation of the counts may help us in assessing if the term was subject to important sense changes.

TODO It might be better, though, to evaluate the cross-cluster variation rather than one-cluster counts.

In [132]:
# Spread of per-period cluster counts: wide boxes hint at unstable senses.
counts_sorted = df.sort_values("kmeans_cluster")
fig = px.box(
    counts_sorted,
    x="kmeans_cluster",
    y="count",
    color="kmeans_cluster",
    color_discrete_map=cluster_color_map,
    facet_col="term",
    facet_col_wrap=2,
    facet_row_spacing=0.02,
    facet_col_spacing=0.02,
    category_orders={"kmeans_cluster": 'category ascending'},
)
fig.update_xaxes(title="", categoryorder="category ascending", showticklabels=True, tickangle=45)
fig.update_layout(height=2000, showlegend=True,
                  title="Variation of collocational clusters counts (by cluster)")
fig.show()
In [133]:
# TODO: variation by genre

Semantic relatedness of collocations through time¶

Data manipulation¶

We're evaluating semantic (dis)similarity of collocates based on vectors retrieved from the word2vec model. To facilitate analyses, we build a similarity matrix for all collocates at once.

In [134]:
# Pairwise cosine-similarity matrix between every collocate vector.
print("The model we'll be using: ", model, ".\n")
# FIX: np.matrix is deprecated (NumPy recommends plain ndarrays); np.array
# keeps the same (n, n) shape and [0:2] slicing behavior used below.
similarity_matrix = np.array(
    [model.wv.cosine_similarities(vec1, coll_vecs_df) for vec1 in coll_vecs_df.to_numpy()]
)
print("Our similarity matrix has shape: ", similarity_matrix.shape, "\n", similarity_matrix[0:2])
The model we'll be using:  Word2Vec<vocab=40504, vector_size=100, alpha=0.025> .

Our similarity matrix has shape:  (1230, 1230) 
 [[1.         0.29489333 0.25937001 ... 0.33302709 0.1681883  0.3654971 ]
 [0.29489333 1.         0.13147971 ... 0.01036476 0.03372622 0.21049999]]

Also, for plotting purposes, we are reducing vectors to 2 dimensions with the t-SNE.

In [135]:
# Build a per-collocate frame (colloc, slice, term, rank), attach each
# collocate's word2vec vector, then project the vectors to 2-D with t-SNE.
tsne_df = all_collocs_time[["colloc", "slice", "term", "rank"]].reset_index(drop=True)
# index by collocate (also kept as a column, used for the lookup below)
tsne_df = tsne_df.set_index("colloc", drop=False)
# vectors come from coll_vecs_df; collocates missing from it get None
tsne_df["vec"] = [ coll_vecs_df.loc[colloc] if colloc in coll_vecs_df.index else None for colloc in tsne_df["colloc"] ]
tsne_df = tsne_df[ ~ tsne_df["vec"].isna()] # exclude collocates not in the word2vec dict

# fit TSNE
vecs = tsne_df["vec"]
vecs = np.array([ vec for vec in vecs ])
# fixed random_state keeps the embedding reproducible across re-runs;
# NOTE(review): perplexity is tied to the number of time slices —
# presumably a deliberate heuristic, confirm
tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate="auto", metric="cosine", perplexity=len(periods))
coords = tsne.fit_transform( vecs )

# add (x,y) coordinates to each row
tsne_df["x"] = [ coord[0] for coord in coords ]
tsne_df["y"] = [ coord[1] for coord in coords ]
# rank arrives as strings; numeric rank is needed for the <= filters below
tsne_df["rank"] = pd.to_numeric(tsne_df["rank"])

tsne_df.head()
/home/krzys/miniconda3/envs/lvlt22/lib/python3.10/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning:

The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.

Out[135]:
colloc slice term rank vec x y
colloc
princeps princeps -450-0 civitas 1 0 1.499599 1 1.654052 2 0.772930 3... 96.779152 110.710159
dono dono -450-0 civitas 2 0 -0.028665 1 0.206296 2 -0.636952 3... -63.424732 -34.199081
noster noster -450-0 civitas 3 0 1.572031 1 -1.847746 2 0.483166 3... -99.519875 102.313583
in in -450-0 civitas 4 0 -0.486388 1 1.539424 2 -1.682679 3... -116.270370 -79.258522
graecia graecia -450-0 civitas 5 0 0.400272 1 0.584794 2 0.923566 3... 87.686058 -13.885004

Semantic relatedness of diachronic collocations: zoom on node word¶

Diachronic collocations may be clustered based on their semantic similarity.

In [136]:
# all terms: only the strongest collocates
topn = 10
# FIX: removed a dead `tsne_df[tsne_df["rank"] <= topn]` expression whose
# result was discarded (the same filter is applied inside px.scatter below).
fig = px.scatter(tsne_df[tsne_df["rank"] <= topn], x="x", y="y", color="slice", text="colloc", facet_col="term",
                 facet_col_wrap=3, facet_row_spacing=0.01)
fig.update_layout(height=2000)
fig.show()

The same data may be visualized dynamically.

In [137]:
# all terms, animated by period: only the strongest collocates
topn = 20
top_rows = tsne_df[tsne_df["rank"] <= topn]
fig = px.scatter(
    top_rows,
    x="x", y="y",
    color="slice",
    text="colloc",
    facet_col="term",
    animation_frame="slice",
    facet_col_wrap=3,
    facet_row_spacing=0.01,
)
fig.update_layout(height=2000, showlegend=False)
fig.show()

Usually, it is easier to analyze terms one by one.

In [139]:
# single term, animated by period: only the strongest collocates
topn = 30
term = 'civitas'
selection = (tsne_df["rank"] <= topn) & (tsne_df["term"] == term)
fig = px.scatter(tsne_df[selection], x="x", y="y", color="slice", text="colloc",
                 animation_frame="slice")
fig.update_layout(height=600, showlegend=False)
fig.show()

Semantic relatedness of diachronic collocations: zoom on period¶

Let's evaluate semantic similarity of all collocates of all terms for each period.

TODO The plot is, however, hardly legible and difficult to interpret.

In [140]:
# all terms, strongest collocates, one facet row per period — admittedly dense
topn = 10
# FIX: removed a dead `tsne_df[tsne_df["rank"] <= topn]` expression whose
# result was discarded (the same filter is applied inside px.scatter below).
fig = px.scatter(tsne_df[tsne_df["rank"] <= topn], x="x", y="y",
                 color="term", text="colloc", color_discrete_map=color_discrete_map_terms,
                 facet_row="slice",
                 facet_col_wrap=2,
                 facet_row_spacing=0.01)
fig.update_layout(height=2000, title="Semantic relatedness of collocates through time")
fig.show()

Semantic change of the diachronic collocation sets¶

Let's assume that semantic value of a collocational set at the point t_i is approximated by the mean of the vectors of its elements.

In [141]:
# TODO: Evaluate if and how the mean is correlated with the rank of collocates included in the set.

First, we calculate vector means for each term-period.

In [142]:
# Pairwise cosine-similarity matrix of the collocate vectors per (term, slice).
df_sims = tsne_df[~ tsne_df["vec"].isna() ].groupby(["term","slice"])["vec"].aggregate(lambda x: cosine_similarity([y for y in x]) ).reset_index()

def _strict_lower_triangle(mat):
    """Return the strictly-lower-triangle entries of a square matrix as 1-D."""
    m = np.asarray(mat)
    return m[np.tril_indices_from(m, k=-1)]

def _safe_stat(stat, values):
    """Apply `stat` to `values`; NaN when a group has no pairwise entries."""
    return stat(values) if values.size else np.nan

# BUG FIX: np.tril() keeps the full n*n matrix with the upper triangle zeroed,
# so np.mean()/np.std() over it averaged in n*(n+1)/2 spurious zeros and
# systematically underestimated the mean pairwise similarity. Extract only the
# strictly-lower-triangle entries (each unordered pair once) instead.
df_sims["mean_sim"] = df_sims["vec"].apply(lambda m: _safe_stat(np.mean, _strict_lower_triangle(m)))
df_sims["std_sim"] = df_sims["vec"].apply(lambda m: _safe_stat(np.std, _strict_lower_triangle(m)))

We are now ready to plot the means for each term for each period. If the mean of collocation set vectors significantly changes, we may suppose the meaning of the term changed as well.

In [143]:
# One line per term: collocational-set coherence (mean pairwise similarity)
# over time.
fig = px.line(
    df_sims,
    x="slice",
    y="mean_sim",
    color="term",
    color_discrete_map=color_discrete_map_terms,
    facet_col="term",
    facet_col_wrap=3,
    facet_row_spacing=0.09,
    facet_col_spacing=0.06,
)
fig.update_yaxes(title="vectors mean")
fig.update_xaxes(title="period", showticklabels=True, tickangle=45)
fig.update_layout(height=1200, showlegend=False)
fig.show()

If the vectors mean is indicative of the word meaning, we may calculate the cosine similarity of the term-periods and plot them on the 2D plane by reducing the vectors with the t-SNE.

In [146]:
# Mean vector per (term, slice): the average of a collocation set's vectors
# is treated as the semantic value of the term in that period.
df_sims_vecs = tsne_df[~ tsne_df["vec"].isna() ].copy().groupby(["term","slice"])["vec"].aggregate(vec_mean=lambda x: np.mean( [y for y in x], axis = 0 ) ).reset_index()

# compute 2-D coordinates for the means (cosine t-SNE, fixed seed for reproducibility)
tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate="auto", metric="cosine", perplexity=len(periods)+1)
coords = tsne.fit_transform( np.array([ x for x in df_sims_vecs["vec_mean"] ]) )
df_sims_vecs["x"] = [coord[0] for coord in coords]
df_sims_vecs["y"] = [coord[1] for coord in coords]
# human-readable "term:period" label for the plots below
df_sims_vecs["label"] = df_sims_vecs.apply(lambda x: x["term"] + ":" + x["slice"], axis=1)
df_sims_vecs.head()
/home/krzys/miniconda3/envs/lvlt22/lib/python3.10/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning:

The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.

Out[146]:
term slice vec_mean x y label
0 civitas -450-0 [0.7331563708465546, 0.5749529541656375, 0.665... 5.269683 -15.128561 civitas:-450-0
1 civitas 0-450 [0.15268317833542824, 0.481530849263072, 0.062... -45.265285 2.392575 civitas:0-450
2 civitas 450-900 [0.13842779921367765, 0.6136744904518128, 0.46... -42.438435 3.893654 civitas:450-900
3 consilium -450-0 [0.6008564168587327, 0.45102046817541125, 0.59... 12.957879 -11.167262 consilium:-450-0
4 consilium 0-450 [0.7520405086023467, 0.19572401635957007, 0.62... 15.846694 -11.784343 consilium:0-450
In [147]:
# Text-only scatter: each point is a term-period mean vector, coloured per term.
fig = px.scatter(df_sims_vecs, x="x", y="y",
                 color="term", text="label",
                 color_discrete_map=color_discrete_map_terms)
# tint each label with its trace's marker colour and lift it above the marker
fig.for_each_trace(lambda tr: tr.update(textfont_color=tr.marker.color,
                                        textposition='top center'))
fig.update_traces(mode="text")
fig.update_layout(height=800, showlegend=False,
                  title="Distance of term-periods vector means (t-SNE)")
fig.show()

The cosine similarity of term-periods may be also investigated on a heatmap.

In [148]:
# similarity between periods and terms, rendered as a heatmap
df_sims_vecs_cosine = cosine_similarity(list(df_sims_vecs["vec_mean"]))

labels = list(df_sims_vecs["label"])
heat = go.Heatmap(x=labels, y=labels, z=df_sims_vecs_cosine)
fig = go.Figure(data=heat)
fig.update_layout(height=1000, title="Semantic relatedness of term-periods (vector means)")
fig.update_yaxes(categoryorder='category descending')
fig.update_xaxes(categoryorder='category ascending')
fig.show()
In [149]:
# heatmap with dendrogram following the example at https://plotly.com/python/dendrogram/#plot-a-dendrogram-with-a-heatmap

# Transposed copy of the cosine-similarity matrix; it is square, so the
# transpose only swaps which axis the dendrograms cluster on.
data_array = df_sims_vecs_cosine
data_array = data_array.transpose()
labels = [ row for row in df_sims_vecs["label"] ]

# Initialize figure by creating upper dendrogram
fig = ff.create_dendrogram(data_array, orientation='bottom', labels=labels)
for i in range(len(fig['data'])):
    # route the top dendrogram traces onto the secondary y-axis (y2)
    fig['data'][i]['yaxis'] = 'y2'

# Create Side Dendrogram
dendro_side = ff.create_dendrogram(data_array, orientation='right')
for i in range(len(dendro_side['data'])):
    # route the side dendrogram traces onto the secondary x-axis (x2)
    dendro_side['data'][i]['xaxis'] = 'x2'

# Add Side Dendrogram Data to Figure
for data in dendro_side['data']:
    fig.add_trace(data)

# Create Heatmap
# leaf order produced by the clustering; ticktext holds row indices as strings
dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
dendro_leaves = list(map(int, dendro_leaves))
# NOTE(review): pdist defaults to EUCLIDEAN distance between rows of the
# similarity matrix (not 1 - cosine). This mirrors the plotly example, but
# confirm it is the intended distance for the heatmap cells.
data_dist = distance.pdist(data_array)
heat_data = distance.squareform(data_dist)
# reorder rows and columns to match the dendrogram leaf order
heat_data = heat_data[dendro_leaves,:]
heat_data = heat_data[:,dendro_leaves]

heatmap = [
    go.Heatmap(
        x = dendro_leaves,
        y = dendro_leaves,
        z = heat_data,
        colorscale = 'Blues'
    )
]

# align heatmap cells with the dendrogram tick positions
heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']

# Add Heatmap Data to Figure
for data in heatmap:
    fig.add_trace(data)

# Edit Layout
fig.update_layout({'width':800, 'height':800,
                         'showlegend':False, 'hovermode': 'closest',
                         })
# Edit xaxis: the heatmap occupies the right 85% of the canvas width
fig.update_layout(xaxis={'domain': [.15, 1],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'ticks':""})
# Edit xaxis2: the side dendrogram occupies the left 15%
fig.update_layout(xaxis2={'domain': [0, .15],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""})

# Edit yaxis: heatmap rows occupy the lower 85% of the canvas height
fig.update_layout(yaxis={'domain': [0, .85],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'showticklabels': False,
                                  'ticks': ""
                        })
# Edit yaxis2: the top dendrogram occupies the upper strip
fig.update_layout(yaxis2={'domain':[.825, .975],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""})

# Plot!
fig.update_layout(height=1000, width=1000)
fig.show()
In [150]:
# qualitative flow of collocates: inspect the raw diachronic collocate table
# for a single term (collocates ranked within each period slice)
collocs_time_by_term_dfs["civitas"]
Out[150]:
colloc slice rank
0 princeps -450-0 1
1 dono -450-0 2
2 noster -450-0 3
3 in -450-0 4
4 graecia -450-0 5
... ... ... ...
145 ipse 450-900 46
146 ingredior 450-900 47
147 dux 450-900 48
148 igitur 450-900 49
149 vel 450-900 50

150 rows × 3 columns

Collocational similarity (network approach)¶

Let's represent collocations of a term or a set of terms as a graph. The node set consists of all the collocates of a term or set of terms, as well as the terms themselves. The edges link collocate nodes with the respective term nodes. The edges are assigned a weight attribute which defines nodes attraction and whose value is inversely proportional to the collocation rank: $1-(rank/max(rank))$.

TODO More rigorous definition.

In [151]:
# Long-format edge table: one row per (term, collocate, period), with a weight
# inversely proportional to the collocation rank: 1 - rank/max(rank).
# FIX: the original grew net_df with pd.concat inside the loop (quadratic
# copying) and used itertools.repeat to fake a constant column; collect the
# frames in a list and concatenate once, and let assign() broadcast the scalar.
frames = []
for term, dataf in collocs_time_by_term_dfs.items():
    frames.append(dataf.assign(term=term))  # scalar broadcasts to every row
net_df = pd.concat(frames)
net_df['slice_term'] = net_df['term'] + '_' + net_df['slice'].str.replace('period','')
ranks = pd.to_numeric(net_df["rank"])
net_df = net_df.assign(weight=1 - ranks / ranks.max())

net_df.head()
Out[151]:
colloc slice rank term slice_term weight
0 princeps -450-0 1 civitas civitas_-450-0 0.98
1 dono -450-0 2 civitas civitas_-450-0 0.96
2 noster -450-0 3 civitas civitas_-450-0 0.94
3 in -450-0 4 civitas civitas_-450-0 0.92
4 graecia -450-0 5 civitas civitas_-450-0 0.90

Since collocate nodes may be linked to multiple terms, intuitively, the network may be used to assess the degree of semantic relatedness between the terms. To increase legibility we will plot only the 20 strongest collocates.

In [152]:
# Collocation network over all periods: term nodes plus their collocates.
rank_thresh = 20
strongest = net_df[pd.to_numeric(net_df["rank"]) <= rank_thresh]
g = nx.from_pandas_edgelist(
    strongest,
    source="colloc",
    target="term",
    edge_key="slice",
    edge_attr=["weight", "slice"],
    create_using=nx.MultiGraph(),
)

# prune leaves: nodes attached to a single neighbour add no comparative signal
g.remove_nodes_from([node for node, deg in g.degree() if deg < 2])

term_set = color_discrete_map_terms.keys()
node_colors = [color_discrete_map_terms[node] if node in term_set else '#808080' for node in g.nodes]
node_sizes = [300 if node in term_set else 0 for node in g.nodes]
pos = nx.spring_layout(g, seed=1)

plt.figure(1, figsize=(20, 20))
nx.draw_networkx(g, pos, width=0.1, node_color=node_colors, node_size=node_sizes, font_size=8)

The networks may be inspected period by period.

In [155]:
# Build one collocation graph per period, for the subplot grid below.
graphs = []
rank_thresh = 20
# FIX: `periods[0:len(periods)]` was a no-op slice (and `i` was unused);
# iterate over periods directly.
for period in periods:
    period_df = net_df[(net_df["slice"] == period) & (pd.to_numeric(net_df["rank"]) <= rank_thresh)]
    g = nx.from_pandas_edgelist(period_df, source="colloc", target="term", edge_key="slice",
                                edge_attr=["weight", "slice"], create_using=nx.MultiGraph())
    # remove low-degree nodes (collocates linked to only one term)
    g.remove_nodes_from([n for n, d in g.degree() if d < 2])
    node_colors = [color_discrete_map_terms[node] if node in color_discrete_map_terms else '#808080' for node in g.nodes]
    node_sizes = [300 if node in color_discrete_map_terms else 0 for node in g.nodes]
    # FIX: removed the dead `font_sizes` list — it was computed here but never
    # stored in the tuple nor used downstream.
    pos = nx.spring_layout(g, seed=675, k=0.99)
    graphs.append((g, pos, node_colors, node_sizes, period))
In [156]:
# Draw each period's graph in a grid of subplots.
ncols = 2
# FIX: ceil-division replaces the divmod quotient/remainder dance.
nrows = (len(graphs) + ncols - 1) // ncols
print(ncols, nrows)
fig, axs = plt.subplots(nrows, ncols, sharex=True, figsize=(20, 30))

# FIX: axs.flatten() is loop-invariant — hoisted out of the loop (it was
# recomputed on every iteration); tuple unpacking names the graph parts.
flat_axes = axs.flatten()
for i, (g, pos, node_colors, node_sizes, period) in enumerate(graphs):
    flat_axes[i].set_title(period)
    nx.draw_networkx(g, pos, width=0.1, node_color=node_colors,
                     node_size=node_sizes,
                     font_size=10,
                     ax=flat_axes[i])
2 2

Likewise, the (dis)similarity may be evaluated for term-periods. This time, we are using the nx.algorithms.community.greedy_modularity_communities algorithm to detect "communities" (i.e. sense clusters) in the network.

TODO Improve legibility or remove.

In [157]:
# Term-period network with community (sense-cluster) detection.
rank_thresh = 10
top_edges = net_df[pd.to_numeric(net_df["rank"]) <= rank_thresh]
g = nx.from_pandas_edgelist(top_edges,
                            source="colloc", target="slice_term",
                            edge_key="slice",
                            edge_attr=["weight", "slice"], create_using=nx.MultiDiGraph())

# drop nodes attached to only one neighbour
g.remove_nodes_from([node for node, deg in g.degree() if deg < 2])

G = g.copy()
communities = nx.algorithms.community.greedy_modularity_communities(G)
# one qualitative colour per detected community
communities_cols = [px.colors.qualitative.Alphabet[idx] for idx in range(len(communities))]

# map every member node to its community colour (first assignment wins)
communities_dict = {}
for idx, community in enumerate(communities):
    for member in community:
        communities_dict.setdefault(member, communities_cols[idx])
In [158]:
# Draw the community-coloured term-period network (labels only, no markers).
pos = nx.spring_layout(G,k=0.01)
plt.figure(1,figsize=(14,14))

for node in G.nodes:
    # each label is tinted with its node's community colour
    col = communities_dict[node]
    nx.draw_networkx_labels(G, pos=pos, labels={node:node}, font_color=col)

# BUG FIX: `communities_dict.values()` iterates in dict-insertion order
# (community by community), NOT in G.nodes order, so colours could be paired
# with the wrong nodes. Index the dict per node so each node gets its own
# community colour.
nx.draw_networkx_nodes(G, pos,
                       node_size=0,
                       label=[ n for n in G.nodes ],
                       node_color=[communities_dict[n] for n in G.nodes])
Out[158]:
<matplotlib.collections.PathCollection at 0x7fa994c3b1f0>
In [159]:
# interactive, but slow!
def plot_pyvis_graph():
    """Render the collocation graph `g` interactively with pyvis.

    Colours the term nodes with their palette colour and writes the result
    to 'collocs.html'.
    """
    viz = network.Network(notebook=True, width="100%")
    viz.from_nx(g)
    viz.show_buttons()

    # give term nodes their assigned palette colour; others keep the default
    for node in viz.nodes:
        label = node["label"]
        if label in color_discrete_map_terms.keys():
            node["color"] = color_discrete_map_terms[label]

    viz.show('collocs.html')

# uncomment to plot (TODO: slow for large graphs)
# plot_pyvis_graph()